library(pacman)
p_load("tidyverse","litsearchr", "here","revtools", "igraph", "tidytext", "reactable", "readxl","waffle") 
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] waffle_1.0.1     readxl_1.4.1     reactable_0.3.0  tidytext_0.3.2  
##  [5] igraph_1.3.4     revtools_0.4.1   here_1.0.1       litsearchr_1.0.0
##  [9] forcats_0.5.2    stringr_1.4.1    dplyr_1.0.9      purrr_0.3.4     
## [13] readr_2.1.2      tidyr_1.2.0      tibble_3.1.8     ggplot2_3.3.6   
## [17] tidyverse_1.3.2  pacman_0.5.1    
## 
## loaded via a namespace (and not attached):
##  [1] googledrive_2.0.0    colorspace_2.0-3     ellipsis_0.3.2      
##  [4] modeltools_0.2-23    rprojroot_2.0.3      fs_1.5.2            
##  [7] rstudioapi_0.14      SnowballC_0.7.0      DT_0.22             
## [10] fansi_1.0.3          lubridate_1.8.0      xml2_1.3.3          
## [13] extrafont_0.18       cachem_1.0.6         knitr_1.40          
## [16] ade4_1.7-19          jsonlite_1.8.0       Rttf2pt1_1.3.10     
## [19] broom_1.0.0          dbplyr_2.1.1         shinydashboard_0.7.2
## [22] shiny_1.7.1          compiler_4.1.3       httr_1.4.4          
## [25] backports_1.4.1      assertthat_0.2.1     Matrix_1.5-1        
## [28] fastmap_1.1.0        lazyeval_0.2.2       gargle_1.2.1        
## [31] cli_3.3.0            later_1.3.0          htmltools_0.5.3     
## [34] tools_4.1.3          NLP_0.2-1            gtable_0.3.1        
## [37] glue_1.6.2           Rcpp_1.0.9           slam_0.1-50         
## [40] cellranger_1.1.0     jquerylib_0.1.4      vctrs_0.4.1         
## [43] extrafontdb_1.0      xfun_0.33            rvest_1.0.3         
## [46] mime_0.12            lifecycle_1.0.2      googlesheets4_1.0.1 
## [49] stringdist_0.9.8     MASS_7.3-58.1        scales_1.2.1        
## [52] hms_1.1.2            promises_1.2.0.1     parallel_4.1.3      
## [55] RColorBrewer_1.1-3   yaml_2.3.5           curl_4.3.2          
## [58] gridExtra_2.3        sass_0.4.2           stringi_1.7.8       
## [61] tokenizers_0.2.1     rlang_1.0.5          pkgconfig_2.0.3     
## [64] evaluate_0.16        lattice_0.20-45      htmlwidgets_1.5.4   
## [67] tidyselect_1.1.2     magrittr_2.0.3       R6_2.5.1            
## [70] topicmodels_0.2-12   generics_0.1.3       DBI_1.1.2           
## [73] pillar_1.8.1         haven_2.5.1          withr_2.5.0         
## [76] janeaustenr_1.0.0    modelr_0.1.9         crayon_1.5.1        
## [79] utf8_1.2.2           plotly_4.10.0        tzdb_0.3.0          
## [82] rmarkdown_2.16       grid_4.1.3           data.table_1.14.2   
## [85] reprex_2.0.2         digest_0.6.29        xtable_1.8-4        
## [88] tm_0.7-8             httpuv_1.6.6         stats4_4.1.3        
## [91] munsell_0.5.0        viridisLite_0.4.0    bslib_0.4.0

Import results

# Read the PubMed export, then drop records whose title is an exact
# duplicate of another record's title.
naive_results <- litsearchr::remove_duplicates(
  litsearchr::import_results(file = "pubmed.nbib"),
  field = "title",
  method = "exact"
)
## Registered S3 methods overwritten by 'synthesisr':
##   method                     from    
##   c.bibliography             revtools
##   print.bibliography         revtools
##   summary.bibliography       revtools
##   as.data.frame.bibliography revtools
##   [.bibliography             revtools
## Reading file pubmed.nbib ... done
# Extract useful data fields.
# `location_id` is split on " and " into up to three pieces; the first piece
# is used as the DOI when it contains "doi", otherwise the second piece is.
naive_results <- naive_results %>%
  tibble::rownames_to_column(var = "ID") %>%
  separate(location_id, c("doi1", "doi2", "doi3"),
           sep = " and ") %>%
  mutate(
    DOI = ifelse(str_detect(doi1, "doi"), doi1, doi2),
    # Build the link only when an identifier exists; paste0() on NA would
    # otherwise produce the literal string "http://doi.org/NA".
    DOI = if_else(
      is.na(DOI),
      NA_character_,
      paste0("http://doi.org/", str_remove(DOI, " \\[doi\\]"))
    ),
    ID = as.numeric(ID)
  ) %>%
  # `keywords` is kept (when the import provides it) because the text-mining
  # step below reads naive_results$keywords; dropping it here silently
  # removed keywords from the analysed text.
  select(ID, author, date_published, title, journal, abstract,
         any_of("keywords"), pubmed_id, DOI)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 4808 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

4904 publications were identified after duplicate removal.

Extract Bag-of-Words from title, abstract and keyword

This process is known as tokenization, which splits text into consecutive sequences of words (tokens) that can be used in text mining. In this process we first extract pre-defined single words potentially relevant to clustering analysis, then the bigrams (two-word phrases) that occur more than 100 times and the trigrams (three-word phrases) that occur more than 50 times.

# Combine all text fields into one string per record.
# unite(na.rm = TRUE) skips missing fields; paste() would instead insert the
# literal string "NA", which is later tokenized as if it were a word.
# any_of() tolerates a `keywords` column that is not present.
my_text <- naive_results %>%
  as_tibble() %>%
  unite("text", any_of(c("title", "abstract", "keywords")),
        sep = " ", na.rm = TRUE) %>%
  select(ID, text)
library(tidytext)

# Single words that may indicate a clustering study
list <- c(
  "subtype", "cluster", "clustering", "class", "classes", "latent",
  "subtype", "subtypes"
)

# Single-word tokens: one row per occurrence of a listed word, where `n` is
# that word's total number of occurrences (counted after stop-word removal)
words <- unnest_tokens(group_by(my_text, ID), word, text)
words <- filter(words, !(word %in% stop_words$word))
words <- group_by(words, word)
words <- mutate(words, n = n())
words <- filter(words, word %in% list)
  
# Bigrams: two-word phrases in which neither word is a stop word, kept only
# when the phrase occurs more than 100 times overall
bigram <- unnest_tokens(group_by(my_text, ID), word, text,
                        token = "ngrams", n = 2)
bigram <- separate(bigram, word, into = c("word1", "word2"), sep = " ")
bigram <- filter(
  bigram,
  !(word1 %in% stop_words$word | word2 %in% stop_words$word)
)
bigram <- unite(bigram, word, c(word1, word2), sep = " ")
bigram <- group_by(bigram, word)
bigram <- mutate(bigram, n = n())
bigram <- filter(bigram, n > 100)
  
# Trigrams: three-word phrases containing no stop word, kept only when the
# phrase occurs more than 50 times overall
trgram <- my_text %>%
  group_by(ID) %>%
  unnest_tokens(word, text, token = "ngrams", n = 3) %>%
  separate(word, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word |
        word2 %in% stop_words$word |
        word3 %in% stop_words$word)
  ) %>%
  unite(word, c(word1, word2, word3), sep = " ") %>%
  group_by(word) %>%
  mutate(n = n()) %>%
  filter(n > 50)
  
# Pool single words, bigrams and trigrams, keep one row per distinct
# token/count pair, and order from least to most frequent
ngram <- rbind(words, bigram, trgram)
ngram <- arrange(distinct(select(ngram, -ID)), n)

ngram$word
##   [1] "patient health questionnaire"    "resonance imaging fmri"         
##   [3] "adverse childhood experiences"   "healthy controls hcs"           
##   [5] "posttraumatic stress symptoms"   "parkinson's disease pd"         
##   [7] "compulsive disorder ocd"         "covid 19 related"               
##   [9] "resonance imaging mri"           "logistic regression models"     
##  [11] "type 2 diabetes"                 "controlled trial background"    
##  [13] "cross sectional survey"          "disease 2019 covid"             
##  [15] "positron emission tomography"    "ptsd symptom clusters"          
##  [17] "bipolar disorder bd"             "2019 covid 19"                  
##  [19] "logistic regression analysis"    "healthy controls hc"            
##  [21] "cognitive impairment mci"        "mental health care"             
##  [23] "middle income countries"         "mental health symptoms"         
##  [25] "genome wide association"         "traumatic brain injury"         
##  [27] "default mode network"            "coronavirus disease 2019"       
##  [29] "multinomial logistic regression" "central nervous system"         
##  [31] "psycinfo database record"        "body mass index"                
##  [33] "mental health services"          "latent class growth"            
##  [35] "cluster randomized trial"        "intimate partner violence"      
##  [37] "class analysis lca"              "structural equation modeling"   
##  [39] "low certainty evidence"          "traumatic stress disorder"      
##  [41] "health related quality"          "income countries"               
##  [43] "clinical practice"               "age sex"                        
##  [45] "emotion regulation"              "related quality"                
##  [47] "trial background"                "hyperactivity disorder adhd"    
##  [49] "cluster randomised controlled"   "secondary outcomes"             
##  [51] "significantly lower"             "cluster sampling"               
##  [53] "school students"                 "disorder adhd"                  
##  [55] "spectrum disorder asd"           "mental health outcomes"         
##  [57] "depressive disorder mdd"         "disorder asd"                   
##  [59] "disorder mdd"                    "clinical trials"                
##  [61] "obsessive compulsive disorder"   "ptsd symptom"                   
##  [63] "usual care"                      "childhood trauma"               
##  [65] "post traumatic stress"           "compulsive disorder"            
##  [67] "risk factor"                     "genome wide"                    
##  [69] "regression analyses"             "month follow"                   
##  [71] "personality disorders"           "multiple sclerosis"             
##  [73] "certainty evidence"              "gray matter"                    
##  [75] "nervous system"                  "health services"                
##  [77] "network analysis"                "previous studies"               
##  [79] "linear regression"               "cortical thickness"             
##  [81] "5 ht"                            "interval ci"                    
##  [83] "confidence interval ci"          "study examined"                 
##  [85] "rating scale"                    "eating disorder"                
##  [87] "post traumatic"                  "future research"                
##  [89] "sars cov"                        "cov 2"                          
##  [91] "sars cov 2"                      "traumatic stress"               
##  [93] "randomised controlled trial"     "cognitive performance"          
##  [95] "data driven"                     "factor analysis"                
##  [97] "major depression"                "study aims"                     
##  [99] "analysis revealed"               "longitudinal study"             
## [101] "regression analysis"             "functional magnetic resonance"  
## [103] "primary outcome"                 "functional magnetic"            
## [105] "latent classes"                  "depression anxiety"             
## [107] "population based"                "prefrontal cortex"              
## [109] "structural equation"             "statistically significant"      
## [111] "evidence based"                  "psychological distress"         
## [113] "brain regions"                   "physical health"                
## [115] "sleep quality"                   "stress disorder ptsd"           
## [117] "anxiety symptoms"                "mental illness"                 
## [119] "disorder ptsd"                   "cluster randomised"             
## [121] "odds ratio"                      "parkinson's disease"            
## [123] "dsm 5"                           "alzheimer's disease ad"         
## [125] "cognitive function"              "mild cognitive impairment"      
## [127] "gene expression"                 "cluster randomized controlled"  
## [129] "regression models"               "mild cognitive"                 
## [131] "randomised controlled"           "health related"                 
## [133] "obsessive compulsive"            "personality disorder"           
## [135] "suicidal ideation"               "12 months"                      
## [137] "disease ad"                      "6 months"                       
## [139] "increased risk"                  "study background"               
## [141] "cognitive decline"               "significant differences"        
## [143] "machine learning"                "deficit hyperactivity disorder" 
## [145] "deficit hyperactivity"           "symptom severity"               
## [147] "attention deficit hyperactivity" "results suggest"                
## [149] "hyperactivity disorder"          "95 confidence interval"         
## [151] "trial registration"              "social support"                 
## [153] "ptsd symptoms"                   "autism spectrum disorder"       
## [155] "attention deficit"               "confidence interval"            
## [157] "systematic review"               "major depressive disorder"      
## [159] "physical activity"               "health outcomes"                
## [161] "posttraumatic stress disorder"   "psychiatric disorders"          
## [163] "symptom clusters"                "randomized controlled trial"    
## [165] "spectrum disorder"               "findings suggest"               
## [167] "public health"                   "cohort study"                   
## [169] "latent profile analysis"         "major depressive"               
## [171] "95 confidence"                   "white matter"                   
## [173] "cross sectional study"           "profile analysis"               
## [175] "sectional study"                 "current study"                  
## [177] "depressive disorder"             "resonance imaging"              
## [179] "magnetic resonance imaging"      "meta analysis"                  
## [181] "autism spectrum"                 "health care"                    
## [183] "latent profile"                  "randomized controlled"          
## [185] "primary care"                    "magnetic resonance"             
## [187] "posttraumatic stress"            "bipolar disorder"               
## [189] "study aimed"                     "stress disorder"                
## [191] "19 pandemic"                     "covid 19 pandemic"              
## [193] "mental disorders"                "cluster analysis"               
## [195] "functional connectivity"         "cluster randomized"             
## [197] "controlled trial"                "logistic regression"            
## [199] "latent class analysis"           "class analysis"                 
## [201] "healthy controls"                "cognitive impairment"           
## [203] "risk factors"                    "alzheimer's disease"            
## [205] "cross sectional"                 "depressive symptoms"            
## [207] "latent class"                    "subtype"                        
## [209] "clustering"                      "classes"                        
## [211] "covid 19"                        "latent"                         
## [213] "subtypes"                        "95 ci"                          
## [215] "class"                           "mental health"                  
## [217] "cluster"

Screening using Bag-of-Words

Manual evaluation

In this section we look more closely at the selected papers published in the top journals, i.e. the journals that publish most of the clustering papers.

Extract results for top journals

Next we look at the list of journals that published most of the papers potentially related to clustering.

# Count records per journal, drop rows with a missing value, and rank the
# journals from most to fewest records
Journals <- naive_results %>%
  group_by(journal) %>%
  summarise(n = n(), .groups = "drop") %>%
  na.omit() %>%
  arrange(desc(n))
nrow(Journals)
## [1] 598
head(Journals, n = 20)
## # A tibble: 20 × 2
##    journal                                                               n
##    <chr>                                                             <int>
##  1 Journal of affective disorders                                       40
##  2 Frontiers in psychiatry                                              32
##  3 International journal of environmental research and public health    25
##  4 PloS one                                                             20
##  5 Journal of psychiatric research                                      19
##  6 Psychiatry research                                                  18
##  7 Scientific reports                                                   18
##  8 Psychological medicine                                               17
##  9 BMC psychiatry                                                       16
## 10 Child abuse & neglect                                                16
## 11 Child psychiatry and human development                               15
## 12 Frontiers in psychology                                              15
## 13 Addictive behaviors                                                  13
## 14 BMJ open                                                             13
## 15 Journal of interpersonal violence                                    13
## 16 Molecular psychiatry                                                 13
## 17 Addiction (Abingdon, England)                                        12
## 18 European child & adolescent psychiatry                               12
## 19 BMC public health                                                    11
## 20 Social psychiatry and psychiatric epidemiology                       11

Here we choose to evaluate papers published in a few journals (broader scope mental health journals with an impact factor of 3 or above).

# Journals chosen for manual evaluation
journals <- c(
  "Journal of affective disorders",
  "Frontiers in psychiatry",
  "BMC psychiatry",
  "Psychological medicine",
  "Journal of psychiatric research",
  "Frontiers in psychology"
)

# Keep only the records published in one of the chosen journals
naive_selected <- filter(naive_results, journal %in% journals)
dim(naive_selected)
## [1] 139   9

139 publications were identified.

Manual screen titles and abstract

Next we manually screened the title and abstract using the screen_abstracts function from the revtools package.

Inclusion and exclusion criteria

  • All studies that applied any clustering method were included.

  • Studies using clustering results from a different study were excluded

# Re-read the PubMed export, keep the records whose title is among the
# selected papers, and open them in the interactive abstract screener
screen <- read_bibliography("pubmed.nbib") %>%
  filter(title %in% naive_selected$title)
screen_abstracts(screen)

Save file for full text review

96 publications were retained after screening for full text review and data extraction. Two publications were further identified as not using clustering methods in the full text review. Data extraction from the remaining 94 publications was conducted by Caroline Gao and Johanna Bayer.

# Prepare the data extraction file: keep the papers listed in the screening
# result and give each one a short identifier built from `author` and year.
screened <- read_csv(here::here("Screened_papers.csv"),
                     show_col_types = FALSE)
Full_text <- naive_selected %>%
  filter(title %in% screened$title) %>%
  mutate(
    year = substr(date_published, 1, 4),
    # text before the first space of `author`, followed by the year
    ID = paste(gsub(" .*$", "", author), year)
  ) %>%
  group_by(ID) %>%
  # n: position among the papers sharing an ID; N: how many papers share it
  mutate(n = row_number(), N = n()) %>%
  # ungroup before rewriting ID so the grouping key is not modified in place
  # and the written table is not left grouped
  ungroup() %>%
  # disambiguate shared IDs with a letter suffix (a, b, c, ...)
  mutate(ID = ifelse(N == 1, ID, paste0(ID, letters[n]))) %>%
  select(ID, title, DOI, author, year, journal, abstract)
write_csv(Full_text, "Full text review.csv")

Results

Data extraction results are listed below.

# Read the data extraction sheet and recode the 0/1 indicator columns as
# No/Yes factors. Cells that cannot be parsed as numbers become NA (with a
# coercion warning) and cells holding any other number become NA silently.
# across() replaces the superseded mutate_at()/vars() pair.
Results <- read_excel("Full text reviewed.xlsx") %>%
  as_tibble() %>%
  mutate(across(
    c(
      `Pre-registration`,
      `Reported method selecting number of clusters`:`Avaliable data`
    ),
    ~ factor(as.numeric(.x), levels = c(0, 1), labels = c("No", "Yes"))
  ))
## Warning in factor(as.numeric(x), levels = c(0, 1), labels = c("No", "Yes")): NAs
## introduced by coercion

## Warning in factor(as.numeric(x), levels = c(0, 1), labels = c("No", "Yes")): NAs
## introduced by coercion

## Warning in factor(as.numeric(x), levels = c(0, 1), labels = c("No", "Yes")): NAs
## introduced by coercion

## Warning in factor(as.numeric(x), levels = c(0, 1), labels = c("No", "Yes")): NAs
## introduced by coercion

## Warning in factor(as.numeric(x), levels = c(0, 1), labels = c("No", "Yes")): NAs
## introduced by coercion
# Interactive table of the extraction results, without the Abstract column
reactable(
  select(Results, -Abstract),
  defaultPageSize = 10,
  filterable = TRUE,
  resizable = TRUE,
  striped = TRUE, # banded rows
  compact = TRUE, # for minimum row height
  fullWidth = FALSE
)

The distribution of modelling approaches is summarised in the following figure. Over half of the publications applied either LCA or LPA.

# Bar chart of the share of publications using each clustering method.
# The share is computed from the data (N / sum(N)) rather than a hard-coded
# total, and the axis label now matches the percentage scale.
Results %>%
  count(Method, name = "N") %>%
  mutate(N = N / sum(N)) %>%
  ggplot(aes(y = reorder(Method, -N),
             x = N)) +
  geom_col() +
  theme_bw() +
  labs(y = "", x = "Percentage of publications") +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1),
                     limits = c(0, 0.4)) +
  geom_text(aes(label = paste0(round(N * 100), "%")),
            position = position_dodge(width = 0.9), hjust = -0.2)

The distributions of modelling choices were similar across the selected journals, with the Journal of Affective Disorders publishing more clustering papers than the other journals.

# Lookup table mapping each clustering method to its broader method type,
# written inline as "Method type | Method" rows.
tb <- " Model-based clustering | Latent Class Analysis (LCA)
Model-based clustering | Latent Profile Analysis (LPA) 
Model-based clustering | Growth Mixture Modelling (GMM)
Model-based clustering | Latent Class Growth Analysis (LCGA)
Model-based clustering | Latent Class Factor Analysis (LCFA)
Model-based clustering | Structural Equation Modeling (SEM)
Model-based clustering | Latent Class Mixed Model (LCMM)
Centre-based partitioning clustering | K-means
Centre-based partitioning clustering | Partition Around Medoids (PAM)
Hierarchical clustering | Hierarchical clustering
Other | Dynamic Time Warp
Other | Hierarchical clustering optimised with k-means
Unclear | Unclear
"
# `text =` makes read.delim() open and close its own connection; passing
# textConnection(tb) left an open connection behind. strip.white trims the
# spaces around each "|"-separated field.
tb <- read.delim(text = tb, header = FALSE,
                 sep = "|", strip.white = TRUE, stringsAsFactors = FALSE)
names(tb) <- c("Method type", "Method")

# Attach the broader method type to every reviewed publication and fix the
# display order of method types and journals.
Model_type <- Results %>%
  # explicit key: join on Method only, instead of on whatever column names
  # the two tables happen to share
  left_join(tb, by = "Method") %>%
  mutate(
    `Method type` = factor(`Method type`,
                           levels = c("Model-based clustering",
                                      "Centre-based partitioning clustering",
                                      "Hierarchical clustering",
                                      "Other",
                                      "Unclear")),
    Journal = factor(Journal,
                     levels = c("Journal of affective disorders",
                                "Frontiers in psychiatry",
                                "BMC psychiatry",
                                "Journal of psychiatric research",
                                "Psychological medicine",
                                "Frontiers in psychology"))
  )
table(Model_type$`Method type`)
## 
##               Model-based clustering Centre-based partitioning clustering 
##                                   70                                   12 
##              Hierarchical clustering                                Other 
##                                    6                                    3 
##                              Unclear 
##                                    3
# Waffle chart of publication counts by method type, one facet row per
# journal. count() returns an ungrouped table, so no grouping message is
# emitted.
Model_type %>%
  count(Journal, `Method type`, name = "Numbers") %>%
  ggplot(aes(fill = `Method type`, values = Numbers)) +
  geom_waffle(color = "white", size = .25, n_rows = 1, flip = FALSE,
              show.legend = TRUE) +
  facet_wrap(Journal ~ ., nrow = 6, strip.position = "left") +
  scale_y_discrete(expand = c(0, 0)) +
  coord_equal() +
  labs(
    y = "",
    x = "Number of publications",
    fill = ""
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        strip.text.y.left = element_text(angle = 0, hjust = 0),
        axis.ticks.x = element_blank(),
        axis.text = element_blank(),
        legend.position = "bottom") +
  scale_fill_manual(values = c("#08519c", "#4292c6", "#9ecae1", "#deebf7", "#969696"))

Over 80% of publications reported methods for choosing the best number of clusters and established additional models to validate the meaningfulness of the clustering results. However, only about 5% of publications applied Cross Validation (CV). Less than 30% used other resampling (mainly the bootstrap likelihood ratio test) or randomisation (random initialization) methods. Only 5 publications provided source data, and among them only one published analysis code.

# Bar chart of the share of publications answering "Yes" on each reporting
# indicator. The denominator is the number of publications in the data
# (n() per indicator, missing answers included) rather than a hard-coded 96,
# which did not match the 94 extracted publications.
Results %>%
  select(ID, `Pre-registration`,
         `Reported method selecting number of clusters`:`Avaliable data`) %>%
  pivot_longer(cols = `Pre-registration`:`Avaliable data`,
               names_to = "Indicator", values_to = "Type") %>%
  group_by(Indicator) %>%
  summarise(N = sum(Type == "Yes", na.rm = TRUE) / n()) %>%
  ggplot(aes(y = reorder(Indicator, -N),
             x = N)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(y = "", x = "Percentage of publications") +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1),
                     limits = c(0, 1)) +
  geom_text(aes(label = paste0(round(N * 100), "%")),
            position = position_dodge(width = 0.9), hjust = -0.2)